!mkdir -p input
# !gunzip input/Mesculenta_305_v6.1.cds_primaryTranscriptOnly.fa.gz
# !gunzip input/Mesculenta_671_v8.1.cds_primaryTranscriptOnly.fa.gz
num_core = 20
blast_type = "nucl"
input_file_old = "input/Mesculenta_305_v6.1.cds_primaryTranscriptOnly.fa"
input_file_new = "input/Mesculenta_671_v8.1.cds_primaryTranscriptOnly.fa"
# Create database index sequence
!makeblastdb -in $input_file_new -dbtype $blast_type -out input/new -title new
# Runing BLAST
!blastn -db input/new \
-query $input_file_old \
-evalue 1e-10 \
-subject_besthit \
-max_hsps 1 \
-max_target_seqs 1 \
-num_threads $num_core \
-outfmt "6 std qcovhsp" \
-out out_blastn_db_new__query_old.txt
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.express as px
first_BLASTp = pd.read_table("out_blastn_db_new__query_old.txt",
names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovhsp"],
header=None)
print("First BLAST result\nNumber of query sequences (new): %d\nNumber of subject sequences (old): %d" % (len(first_BLASTp.qseqid.unique()), len(first_BLASTp.sseqid.unique())))
# sns.jointplot(data=first_BLASTp, x="qcovhsp", y="pident")
import plotly.express as px
fig = px.scatter(
first_BLASTp,
x="qcovhsp",
y="pident",
marginal_x="histogram",
marginal_y="histogram",
hover_data = {
"qseqid": True,
"sseqid": True,
},
width=500,
height=500)
fig.show()
first_BLASTp_filter_criteria = pd.DataFrame(columns = ["pident", "qcovhsp", "num_genes"])
for pident in range(0, 101, 5):
for qcovhsp in range(0, 101, 5):
first_BLASTp_filter_criteria = first_BLASTp_filter_criteria \
.append({'pident': pident,
'qcovhsp': qcovhsp,
'num_genes': first_BLASTp[(first_BLASTp.pident >= pident) & (first_BLASTp.qcovhsp >= qcovhsp)].shape[0]
}, ignore_index=True)
df = first_BLASTp_filter_criteria.pivot(index='pident', columns='qcovhsp', values='num_genes')
fig = px.imshow(df,
labels=dict(x="> % Query coverage grather than", y="> % Identity grather than", color="Number of genes"),
width=500, height=500)
fig.show()
first_BLASTp = first_BLASTp[(first_BLASTp.qcovhsp > 80) & (first_BLASTp.pident > 80)]
print("First BLAST result\nNumber of query sequences (new): %d\nNumber of subject sequences (old): %d" % (len(first_BLASTp.qseqid.unique()), len(first_BLASTp.sseqid.unique())))
!makeblastdb -in $input_file_old -dbtype $blast_type -out input/old -title old
!blastn -db input/old \
-query $input_file_new \
-evalue 1e-10 \
-subject_besthit \
-max_hsps 1 \
-max_target_seqs 1 \
-num_threads 20 \
-outfmt "6 std qcovhsp" \
-out out_blastn_db_old__query_new.txt
second_BLASTp = pd.read_table("out_blastn_db_old__query_new.txt",
names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovhsp"],
header=None)
print("First BLAST result\nNumber of query sequences (old): %d\nNumber of subject sequences (new): %d" % (len(second_BLASTp.qseqid.unique()), len(second_BLASTp.sseqid.unique())))
# sns.jointplot(data=second_BLASTp, x="qcovhsp", y="pident")
import plotly.express as px
fig = px.scatter(
second_BLASTp,
x="qcovhsp",
y="pident",
marginal_x="histogram",
marginal_y="histogram",
hover_data = {
"qseqid": True,
"sseqid": True,
},
width=500,
height=500)
fig.show()
second_BLASTp_filter_criteria = pd.DataFrame(columns = ["pident", "qcovhsp", "num_genes"])
for pident in range(0, 101, 5):
for qcovhsp in range(0, 101, 5):
second_BLASTp_filter_criteria = second_BLASTp_filter_criteria \
.append({'pident': pident,
'qcovhsp': qcovhsp,
'num_genes': second_BLASTp[(second_BLASTp.pident >= pident) & (second_BLASTp.qcovhsp >= qcovhsp)].shape[0]
}, ignore_index=True)
df = second_BLASTp_filter_criteria.pivot(index='pident', columns='qcovhsp', values='num_genes')
fig = px.imshow(df,
labels=dict(x="% Query coverage grather than", y="> % Identity grather than", color="Number of genes"),
width=500,
height=500)
fig.show()
second_BLASTp = second_BLASTp[(second_BLASTp.qcovhsp > 80) & (second_BLASTp.pident > 80)]
print("First BLAST result\nNumber of query sequences (old): %d\nNumber of subject sequences (new): %d" % (len(second_BLASTp.qseqid.unique()), len(second_BLASTp.sseqid.unique())))
DBH_blast_result = first_BLASTp \
.filter(["qseqid", "sseqid"]) \
.drop_duplicates() \
.merge(second_BLASTp.filter(["qseqid", "sseqid"]).drop_duplicates(),
how='left',
left_on="qseqid",
right_on="sseqid",
suffixes=["_1st_BLASTp", "_2nd_BLASTp"]
) \
.assign(DBH = lambda x: (x.sseqid_1st_BLASTp == x.qseqid_2nd_BLASTp) & (x.qseqid_1st_BLASTp == x.sseqid_2nd_BLASTp)) \
.sort_values('DBH', ascending=False) \
.filter(["qseqid_1st_BLASTp", "sseqid_1st_BLASTp", "DBH"]) \
.rename(columns={"qseqid_1st_BLASTp": "old_transcript_id",
"sseqid_1st_BLASTp": "new_transcript_id"}) \
.assign(old_gene_id = lambda x: x.old_transcript_id.replace(".\d+$", "", regex=True),
new_gene_id = lambda x: x.new_transcript_id.replace(".\d+$", "", regex=True)) \
.assign(gene_match=lambda x: x.new_gene_id == x.old_gene_id)
DBH_blast_result
DBH_blast_result \
.groupby('DBH') \
.size()
DBH_blast_result.groupby(["DBH", "gene_match"]).size()
!wget -q https://raw.githubusercontent.com/evolu-tion/GenomeManagement/master/seq_manage.py
from seq_manage import Gff_manager
def getOrderGeneFromChr(fileName):
gff = Gff_manager(fileName)
df = pd.DataFrame(gff.getTableSpecificType("gene"),
columns=["chr", "source", "feature", "start", "end", "score", "strand", "frame", "attr"])
df[["ID", "GeneID", "ancestorIdentifier"]] = pd.DataFrame(df["attr"].to_list())
df = df.replace('^[A-Za-z]+=','',regex=True) \
.drop(columns=["attr"]) \
.sort_values(by = ["chr", "start", "GeneID"], ignore_index = True) \
.rename_axis("Order").reset_index()
return df[["GeneID", "Order"]].set_index("GeneID")
tbl_gene_new = getOrderGeneFromChr("input/Mesculenta_671_v8.1.gene.gff3.gz")
tbl_gene_old = getOrderGeneFromChr("input/Mesculenta_305_v6.1.gene.gff3.gz")
DBH_blast_result = DBH_blast_result \
.merge(tbl_gene_new,
how = 'left',
left_on = "new_gene_id",
right_index = True) \
.merge(tbl_gene_old,
how = 'left',
left_on = "old_gene_id",
right_index = True,
suffixes = ("_new", "_old")) \
.assign(new_chr = DBH_blast_result.new_gene_id.replace('.\d+$', '', regex=True).replace('Manes.', 'Chr', regex=True))
import plotly.express as px
fig = px.scatter(
DBH_blast_result.sort_values("new_transcript_id"),
x="Order_new",
y="Order_old",
color = "new_chr",
hover_data = {
"new_transcript_id" : True,
"old_transcript_id" : True,
"Order_new": False,
"Order_old": False,
"new_chr": False,
"DBH": True
},
width=600,
height=500,
labels=dict(
Order_new="Gene ID (New version)",
Order_old="Gene ID (Old version)",
new_chr="Chromosome of new version"
)
)
fig.show()
DBH_blast_result.to_csv("DBH_BLASTn_old_new.txt", sep='\t', index=False)